library(tidyverse)
library(haven)
library(pals)
library(sandwich)
library(lmtest)

#### 0. Read in convenience functions and set metadata ####

source("0_conjoint_functions.R")
source("0_AZ_recoding_functions.R")

countries <- c("AUS","BR","CAN","CHL","CHN","COL","FR","GHA","IND","IT","JPN","SP","UGA","UK","US", "ZAF")
en_surveys <- c("AUS","CAN","GHA","IND","UGA","UK","US","ZAF")

ref_levels = c(gdp = "0%",
               jobs = "0%",
               supplies = "Quick to procure",
               deaths = "10 per million",
               vaccinated = "5%",
               lockdown = "10 weeks")

#### 1. Refresh data from source ####

## Copies over data (country files + combined) from main CANDOUR II repository
# refresh_data("/Users/tsr/Dropbox (Personal)/candour2/data", countries)

# pooled_data <- create_conjoint_data(countries, ref_levels)

# Check integrity of conjoint recoding
# source("recoding_checks.R")
# 
# write_csv(pooled_data, "formatted_data/conjoint_only.csv")
# write_dta(pooled_data, "formatted_data/conjoint_only.dta")

#### 2. Add additional variables ####

pca_mods <- list() # empty list to store (then save) PCA models used in recoding

## Add in Alexei's additional variables
subject_data <- read_csv("survey_data/data_combined.csv")
recode_data <- az_recode(subject_data)

# write_csv(recode_data, "formatted_data/combined_data_w_additional_vars.csv")

pooled_data <- read_csv("formatted_data/conjoint_only.csv")

panel_data <- read_csv("../pgg_paper/data/panel_data_wide.csv") %>% 
  mutate(country = country_to_code(country_w2),
         c_id = paste0(country,"_",id)) %>%  
  filter(wave_w2 == 2,
         !duplicated(c_id)) %>% 
  select(c_id, deaths_w2, cases_w2, pop2019_w2, E1_1_w2, E1_2_w2)

##### Food severity #####

# Function to calculate impact of Covid on food questions
food_severity <- function(q_food, q_covid, yes_opts = "Yes|Sí|Oui|Sì", no_opts = "No|Non") {
  
  q_food = case_when(str_detect(q_food, yes_opts) ~ 1,
                     str_detect(q_food, no_opts) ~ 0)
  q_covid = case_when(str_detect(q_covid, yes_opts) ~ 1,
                      str_detect(q_covid, no_opts) ~ 0)
  
  q_sev = case_when(q_food == 1 & q_covid == 1 ~ 1,
                    q_food == 1 & q_covid == 0 ~ 0.5,
                    q_food == 0 ~ 0)
  
}

# Apply to the 8 relevant questions
subject_data$food1 <- food_severity(subject_data$eco_out_17, subject_data$eco_out_18)
subject_data$food2 <- food_severity(subject_data$eco_out_20, subject_data$eco_out_21)
subject_data$food3 <- food_severity(subject_data$eco_out_23,subject_data$eco_out_24)
subject_data$food4 <- food_severity(subject_data$eco_out_26, subject_data$eco_out_27)
subject_data$food5 <- food_severity(subject_data$eco_out_29,subject_data$eco_out_30)
subject_data$food6 <- food_severity(subject_data$eco_out_32,subject_data$eco_out_33)
subject_data$food7 <- food_severity(subject_data$eco_out_36,subject_data$eco_out_37)
subject_data$food8 <- food_severity(subject_data$eco_out_40,subject_data$eco_out_41)

subject_data$food_indx <- subject_data$food1 + 
  subject_data$food2 + 
  subject_data$food3 + 
  subject_data$food4 + 
  subject_data$food5 + 
  subject_data$food6 + 
  subject_data$food7 + 
  subject_data$food8

food_pca_model <- princomp(~., 
                           subject_data[,c("food1","food2","food3","food4",
                                               "food5","food6","food7","food8")],
                           na.action = na.exclude)

pca_mods[['food']] <- food_pca_model

subject_data$food_pca <- predict(food_pca_model)[,1]

##### Household income change

subject_data$hh_inc_delta <- subject_data$eco_out_13

##### EQ-5D-5L #####

subject_data$eq5d_rate_now <- subject_data$eq5d_6
subject_data$eq5d_rate_1y <- subject_data$eq5d_7
subject_data$eq5d_rate_delta <- subject_data$eq5d_rate_now - subject_data$eq5d_rate_1y

##### WHO health #####

subject_data <- subject_data %>% 
  mutate(across(starts_with("health_3_"),
                ~ case_when(grepl("All", .x) ~ 5,
                            grepl("Most", .x) ~ 4,
                            grepl("More", .x) ~ 3,
                            grepl("Less than", .x) ~ 2,
                            grepl("Some of", .x) ~ 1,
                            grepl("At no", .x) ~ 0,
                            TRUE ~ NA_real_)
  ))

subject_data$who_indx <- subject_data$health_3_1 +
  subject_data$health_3_2 +
  subject_data$health_3_3 +
  subject_data$health_3_4 +
  subject_data$health_3_5

who_pca_mod <- princomp(~., 
                        subject_data[,c("health_3_1","health_3_2","health_3_3","health_3_4","health_3_5")],
                        na.action = na.exclude)

pca_mods[['who']] <- who_pca_mod

subject_data$who_pca <- predict(
  who_pca_mod
)[,1]



##### Household income #####

subject_data$hh_inc_obj <- as.integer(subject_data$INCOME_HH_HIGH > subject_data$INCOME_HH_MEAN)

##### Domestic vs others -- politics #####

subject_data$politics_diff_1 <- subject_data$politics_1 - subject_data$politics_2
subject_data$politics_diff_2 <- subject_data$politics_3 - subject_data$politics_4
subject_data$politics_diff_3 <- subject_data$politics_5 - subject_data$politics_6
subject_data$politics_diff_4 <- subject_data$politics_7 - subject_data$politics_8
subject_data$politics_diff_5 <- subject_data$politics_8 - subject_data$politics_10

##### Attention measure #####

subject_data$attention <- ifelse(
  subject_data$att_chk_2 %in% c(
    "None of the above",
    "Ninguna de las anteriores", #Spanish
    "Nessuna di queste", #Italian
    "Nenhuma das opções acima", #Portuguese
    "上記のいずれでもない", # Japanese
    "以上皆不是", # Chinese
    "Rien de tout cela" # French
  ),
  1,0)

##### Vaccine status #####

subject_data$subj_vaccinated <- ifelse(str_detect(subject_data$vac_hist_2,"Yes,"),1,0)
subject_data$subj_vacc_refused <- ifelse(subject_data$vac_hist_1 == "Yes" & 
                                        str_detect(subject_data$vac_hist_2, "No,"),
                                      1,0)

##### Health compliance #####

health_compliance <- function(q, yes_opts = "Yes|Sí|Oui|Sì", no_opts = "No|Non") {
  
  compliance = case_when(str_detect(q, yes_opts) ~ 1,
                         str_detect(q, no_opts) ~ 0)
  
}

for (i in 1:17) {
  subject_data[[paste0("health_compl_",i)]] <- health_compliance(subject_data[[paste0("vac_hes_6_",i)]])
}

health_compl_pca_mod <- princomp(~., 
                                 subject_data[,paste0("health_compl_",1:17)],
                                 na.action = na.exclude)

summary(health_compl_pca_mod)

pca_mods[["health_compl"]] <- health_compl_pca_mod

subject_data$health_compl_pca <- predict(health_compl_pca_mod)[,1]



##### Engagement with spending on health #####
subject_data$health_spend_1 <- case_when(subject_data$health_pol_1 == "Yes" ~ 1,
                                         subject_data$health_pol_1 == "No" ~ -1)
subject_data$health_spend_2 <- (2*subject_data$health_pol_17-100)/100
subject_data$health_spend_3<- case_when(str_detect(subject_data$health_pol_18, "Spend more") ~ 1,
                                        str_detect(subject_data$health_pol_18, "Spend the same") ~ 0,
                                        str_detect(subject_data$health_pol_18, "Spend less") ~ -1)

health_spend_pca_mod <- princomp(~., 
                                 subject_data[,c(paste0("health_spend_",1:3))],
                                 na.action = na.exclude)

summary(health_spend_pca_mod)

pca_mods[["health_spend"]] <- health_spend_pca_mod

subject_data$health_spend_pca <- predict(health_spend_pca_mod)[,1]

##### No. of international flights #####

subject_data$intl_flights <- case_when(str_starts(subject_data$health_pol_19, "10 or more") ~ 10,
                                       str_starts(subject_data$health_pol_19, "\\d") ~ as.numeric(str_sub(subject_data$health_pol_19,1,2)),
                                       TRUE ~ NA)

#### 3. Combine final variables from subject data ####

vars_of_interest <- c(
  "c_id",
  "REGION_0",
  "gender",
  "age", 
  "vac_hes_4_1", 
  "vac_hes_7", 
  "ideology",
  "food_indx",
  "food_pca",
  "hh_inc_delta", 
  "hh_inc_obj",
  "eq5d_rate_now", 
  "eq5d_rate_delta",
  "who_indx",
  "who_pca",
  "politics_1",
  "politics_2",
  "politics_3",
  "politics_4",
  "politics_5",
  "politics_6",
  "politics_7",
  "politics_8",
  "politics_9",
  "politics_10",
  "health_pol_1",
  "health_pol_13",
  "health_pol_17",
  "health_pol_18",
  "politics_diff_1",
  "politics_diff_2",
  "politics_diff_3",
  "politics_diff_4",
  "politics_diff_5",
  "income_abovemed", 
  "covidexp_index",
  "covidexp_index_abovemed",
  "EDUCATION_LEVEL", 
  "attention", 
  "dep_children", 
  "marital_status",
  "subj_vaccinated",
  "subj_vacc_refused",
  "gov_relect",
  "gov_rate",
  "health_spend_pca",
  "health_compl_pca",
  "intl_flights",
  "weights")

subject_data_final <- subject_data %>% 
  left_join(recode_data %>% 
              select(id, 
                     country, 
                     all_of(c("income_abovemed", 
                              "covidexp_index",
                              "covidexp_index_abovemed"))),
            by = c("country","id")) %>% 
  mutate(country = country_to_code(country)) %>% 
  mutate(c_id = paste0(country,"_",id)) %>% 
  select(all_of(vars_of_interest)) %>% 
  distinct()

#### 4. Add in OxCGRT variables ####

oxcgrt <- read.csv("additional_data/OxCGRT_nat.csv")
oxcgrt_sub <- read.csv("additional_data/OxCGRT_subnat.csv")

# Remove 59 rows are NA for China
oxcgrt <- oxcgrt[complete.cases(oxcgrt$C1M_School.closing),]

oxcgrt_frame <- subject_data %>% 
  mutate(date = lubridate::as_date(StartDate)) %>% 
  group_by(country) %>% 
  summarise(Begin_date = min(date)) %>% 
  rename(CountryName = country) %>% 
  mutate(CountryName = case_when(CountryName == "UK" ~ "United Kingdom",
                                 CountryName == "US" ~ "United States",
                                 TRUE ~ CountryName))

# oxcgrt_frame <- data.frame(
#   CountryName = c("Australia", "Brazil", "Canada", "Chile", "China", 
#                "Colombia", "France", "Ghana", "India", "Italy", 
#                "Japan", "South Africa", "Spain", "United Kingdom",
#                "United States", "Uganda"),
#   Begin_date = c(20220301, 20230605, 20230228, 20220505,
#                  20230502, 20230516, 20230502, 20220627,
#                  20230302, 20230503, 20220610, 20221017,
#                  20230517, 20230228, 20230127, 20220518)
# )


oxcgrt_vars <- oxcgrt %>% left_join(oxcgrt_frame, by = "CountryName") %>% 
  mutate(Date = as_date(ymd(Date))) %>% 
  filter(!is.na(Begin_date),
         Date < Begin_date) %>% 
  select(CountryName, 
         Date,
         Begin_date,
         matches("C[1-9][a-zA-Z]*_*"),
         starts_with("E1_"),
         starts_with("E2_")) %>% 
  select(!ends_with("_Flag")) %>% 
  group_by(CountryName) %>% 
  pivot_longer(cols = !all_of(c("CountryName","Date","Begin_date")),
               names_to = "variable",
               values_to = "measure") %>% 
  group_by(CountryName, variable) %>% 
  summarise(days_under_measures = sum(measure > 0)) %>% 
  mutate(variable = str_sub(variable,1,2)) %>% 
  pivot_wider(id_cols = "CountryName",
              names_from = "variable",
              values_from = "days_under_measures") %>% 
  rename(country = CountryName) %>% 
  mutate(country = case_when(country == "United Kingdom" ~ "UK",
                             country == "United States" ~ "US",
                             .default = country),
         country = country_to_code(country)) %>% 
  ungroup()

##### Subnational variables #####

oxcgrt_subframe <- oxcgrt_frame %>% 
  filter(CountryName %in% unique(oxcgrt_sub$CountryName))

oxcgrt_subvars <- oxcgrt_sub %>% left_join(oxcgrt_subframe, by = "CountryName") %>% 
  mutate(Date = as_date(ymd(Date))) %>% 
  filter(!is.na(Begin_date),
         Date < Begin_date,
         Jurisdiction == "STATE_TOTAL") %>% 
  select(CountryName, 
         RegionName,
         Date,
         Begin_date,
         matches("C[1-9][a-zA-Z]*_*"),
         starts_with("E1_"),
         starts_with("E2_")) %>% 
  select(!ends_with("_Flag")) %>% 
  group_by(CountryName, RegionName) %>% 
  pivot_longer(cols = !all_of(c("CountryName","RegionName","Date","Begin_date")),
               names_to = "variable",
               values_to = "measure") %>% 
  group_by(CountryName, RegionName, variable) %>% 
  summarise(days_under_measures = sum(measure > 0)) %>% 
  mutate(variable = str_sub(variable,1,2)) %>% 
  pivot_wider(id_cols = c("CountryName","RegionName"),
              names_from = "variable",
              values_from = "days_under_measures") %>% 
  rename(country = CountryName) %>% 
  rename_with( ~ paste0(.x,"_sub"), matches("[C|E][1-8]")) %>% 
  mutate(country = case_when(country == "United Kingdom" ~ "UK",
                             country == "United States" ~ "US",
                             .default = country),
         country = country_to_code(country),
         RegionName = str_to_lower(RegionName),
         RegionName = case_when(RegionName == "washington dc" ~ "district of columbia",
                                RegionName == "xinjiang" ~ "xinjiang uyghur",
                                RegionName == "ningxia" ~ "ningxia hui",
                                RegionName == "guangxi" ~ "guangxi zhuang",
                                TRUE ~ RegionName)) %>% 
  ungroup()

##### Containment and economic policy PCA scores #####

C_pca_mod <- princomp(~., oxcgrt_vars %>% select(matches("C[1-9]")))
E_pca_mod <- princomp(~., oxcgrt_vars %>% select(matches("E[1-9]")))

pca_mods[['C']] <- C_pca_mod
pca_mods[['E']] <- E_pca_mod

oxcgrt_vars$C_pca <- predict(C_pca_mod)[,1]
oxcgrt_vars$E_pca <- predict(E_pca_mod)[,1]

#### 5. Regime classification ####

whogov_classification <- read_csv("additional_data/whogov.csv") %>% 
  group_by(country_isocode) %>% 
  filter(year == 2022) %>% 
  mutate(country_isocode = case_when(country_isocode == "BRA" ~ "BR",
                                     country_isocode == "FRA" ~ "FR",
                                     country_isocode == "ITA" ~ "IT",
                                     country_isocode == "ESP" ~ "SP",
                                     country_isocode == "GBR" ~ "UK",
                                     country_isocode == "USA" ~ "US",
                                     TRUE ~ country_isocode
  )) %>% 
  filter(country_isocode %in% countries) %>% 
  mutate(system = case_when(str_detect(system_category,"dictator") ~ "Dictatorship",
                            str_detect(system_category,"Parliamentary") ~ "Parliamentary",
                            str_detect(system_category,"Presidential") ~ "Presidential",
                            TRUE ~ "Other")) %>% 
  select(country_isocode, system)
  
#### 6. Political constraint measure ####

polcon_data <- readxl::read_xlsx("additional_data/polcon.xlsx") %>% 
  mutate(ctrynm = case_when(ctrynm == "BRA" ~ "BR",
                            ctrynm == "FRN" ~ "FR",
                            ctrynm == "ITA" ~ "IT",
                            ctrynm == "SPN" ~ "SP",
                            ctrynm == "UKG" ~ "UK",
                            ctrynm == "USA" ~ "US",
                            ctrynm == "SAF" ~ "ZAF",
                            TRUE ~ ctrynm
  )) %>% 
  filter(year == 2021,
         ctrynm %in% countries) %>% 
  select(ctrynm, POLCONV_VDEM)

#### 7. Clean, merge, and write output ####

final_data <- pooled_data %>% 
  mutate(c_id = paste0(country,"_",id)) %>% 
  left_join(subject_data_final, by = "c_id") %>% 
  filter(!is.na(id)) %>% # 1 Ghanaian participant
  left_join(panel_data %>% select(c_id, deaths_w2, cases_w2, pop2019_w2), by = "c_id") %>% #E1_1_w2, E1_2_w2
  left_join(oxcgrt_vars, by = "country") %>% 
  mutate(region_merge = str_to_lower(stringi::stri_trans_general(REGION_0, "Latin-ASCII")),
         region_merge = gsub(" / (.*)","", region_merge)) %>% 
  left_join(oxcgrt_subvars, by = c("region_merge" = "RegionName", "country")) %>% 
  left_join(whogov_classification, by = c("country" = "country_isocode")) %>% 
  left_join(polcon_data, by = c("country" = "ctrynm"))

write_csv(final_data, "formatted_data/conjoint_data.csv")
write_dta(final_data, "formatted_data/conjoint_data.dta")

zip("formatted_data/conjoint_data.dta.zip","formatted_data/conjoint_data.dta")
file.remove("formatted_data/conjoint_data.dta")

#### Extra steps ####

##### E1. Save all PCA mods for analysis #####

write_rds(pca_mods, "models/pca_recoding_mods.RDS")

##### E2. Push to git #####

# system(paste0("cd '",getwd(),"'"))
# system("git add .")
# system(paste0("git commit -m 'data_refresh_",format(Sys.time(), "%d_%m_%y"),"'"))
# system("git push")










#### ARCHIVE
# oxcgrt_country <- oxcgrt_frame
# count <- 0
# for (i in 1:8) {
#   column <- 5 + 2 * i
#   max_val <- max(oxcgrt[, column], na.rm = TRUE)
#   
#   for (k in 0:max_val) {
#     count <- count + 1
#     
#     for (j in 1:nrow(oxcgrt_country)) {
#       oxcgrt_country[j, paste0("C", i, "_", k, "_w2")] <- sum(
#         oxcgrt[oxcgrt$Date < oxcgrt_country$Begin_date[j] & oxcgrt$CountryName == oxcgrt_country$Country[j], column] == k
#         
#       )
#     }
#   }
# }
# 
# count <- 0
# for (i in 1:2) {
#   column <- 20 + 2 * i
#   max_val <- max(oxcgrt[, column], na.rm = TRUE)
#   
#   for (k in 0:max_val) {
#     count <- count + 1
#     
#     for (j in 1:nrow(oxcgrt_country)) {
#       oxcgrt_country[j, paste0("E", i, "_", k, "_w2")] <- sum(
#         oxcgrt[oxcgrt$Date < oxcgrt_country$Begin_date[j] & oxcgrt$CountryName == oxcgrt_country$Country[j], column] == k
#         
#       )
#     }
#   }
# }
# 
# colnames(oxcgrt_country)[1] <- "country"
# 
# oxcgrt_final <- oxcgrt_country[,str_detect(colnames(oxcgrt_country), "_0_", negate = TRUE)]
# 
# princomp(oxcgrt_final[,!(colnames(oxcgrt_final) %in% c("E1_1_w2","E1_2_w2","E2_1_w2", "E2_2_w2"))])
# 
# oxcgrt_final$country <- country_to_code(oxcgrt_final$country)